renv

This project uses renv to manage CRAN packages. See here for how to get this working on your machine.

https://rstudio.github.io/renv/articles/collaborating.html

Imports

library(readr)
library(dplyr)
library(scales)
library(stringr)
library(ggplot2)
library(patchwork)
library(lubridate)
library(kableExtra)

Sources

source("utils.R")

File paths

malawi_per_sample_intro_handle <- "/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/malawian_introductions.excl_mwi_dups.txt"
national_cases_handle <- "/Users/flashton/Dropbox/COVID/our_world_in_data/2022.12.05/owid-covid-data.csv"
restrictions_handle <- "/Users/flashton/Dropbox/COVID/our_world_in_data/2022.12.05/international-travel-covid.csv"

vars to set

vocs_to_plot <- list("20H (Beta,V2)", "20I (Alpha,V1)", "21A (Delta)", "21I (Delta)", "21J (Delta)", "21K (Omicron)", "21L (Omicron)", "22A (Omicron)", "22B (Omicron)")
countries_to_include <- c("MWI", "KEN", "ZAF", "VNM", "THA", "IDN", "KHM")
threshold_for_plotting <- 10

Size distribtion by VOC

plot_size_distribution_by_VOC <- function(distinct_intros, country_iso){
  df_labels <- distinct_intros %>% group_by(voc_for_plots) %>% summarise(n = paste0("n = ",  n()))
  # will need to change the 200 below to something larger for countries with larger cluster sizes.
  p <- ggplot(distinct_intros, aes(x = voc_for_plots, y = cluster_size, colour = voc_for_plots)) +
    geom_jitter(alpha = 0.4, width = 0.3) + 
    theme(text=element_text(size=20), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    stat_summary(fun = "median", colour = "black", shape = 95) +
    geom_label(data = df_labels, aes(y = Inf, label = n), label.size = 0, fill = NA, vjust = 1, size = 4) +
    scale_y_log10(limits = c(1, max(distinct_intros$cluster_size) + 200)) +
    ggtitle(paste(country_iso, 'introduction size', sep = '-'))
  print(p)
  #print(q)
  return(distinct_intros)
}

stats on distinct intros

basic_stats_on_distinct_intros <- function(distinct_intros){
  group_by(distinct_intros, voc_for_plots) %>% summarise(number_clusters = n(), number_genomes = sum(cluster_size), mean_cluster_size = mean(cluster_size, na.rm = TRUE), sd_cluster_size = sd(cluster_size, na.rm = TRUE), median_cluster_size = median(cluster_size, na.rm = TRUE), IQR_cluster_size = IQR(cluster_size, na.rm = TRUE)) %>% kbl() %>% kable_styling()
}

main function

If kruskal.test p-value is <0.05, then need to do pairwise wilcoxon rank sum tests.

If you want to do troubleshooting, and not run all the countries/genotypes, then change eval to FALSE.

national_cases <- read_in_national_case_numbers(national_cases_handle, countries_to_include)
#View(national_cases)
restrictions <- read_in_restrictions(restrictions_handle, countries_to_include)

run_main <- function(per_sample_intro_handle, country_iso, n_cases, restrictions, threshold_for_plotting){
  #print(per_sample_intro_handle)
  #print(country_iso)
  per_sample_output <- read_in_per_sample_intros(per_sample_intro_handle, threshold_for_plotting)
  per_sample_intros <- per_sample_output$per_sample_intros
  vocs_to_plot <- per_sample_output$vocs_to_plot
  #View(per_sample_output)
  distinct_intros <- distinct(per_sample_intros, introduction_node, .keep_all = TRUE)
  plot_size_distribution_by_VOC(distinct_intros, country_iso)
  basic_stats_on_distinct_intros(distinct_intros)
  print(kruskal.test(cluster_size ~ voc_for_plots, data = distinct_intros))
  #View(n_cases)
  make_overall_plots(per_sample_intros, n_cases, restrictions, country_iso)
  lapply(vocs_to_plot, make_per_VOC_plots, per_sample_intros = per_sample_intros, national_cases = n_cases, restrictions = restrictions, country_iso = country_iso)
}

#countries_to_plot <- c("MWI", "KEN")
#per_sample_intro_handles <- c("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/malawian_introductions.excl_mwi_dups.txt", "/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/kenyan_introductions.txt")

countries_to_include <- c("MWI", "KEN", "ZAF", "VNM", "THA", "IDN", "KHM", "LAO")

#run_main("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/cambodian_introductions.txt", "KHM", n_cases = national_cases, restrictions = restrictions, threshold_for_plotting = threshold_for_plotting)
#run_main("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/indonesian_introductions.txt", "IDN", n_cases = national_cases, restrictions = restrictions, threshold_for_plotting = threshold_for_plotting)
#run_main("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/kenyan_introductions.txt", "KEN", n_cases = national_cases, restrictions = restrictions, threshold_for_plotting = threshold_for_plotting)
#run_main("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/laos_introductions.txt", "LAO", n_cases = national_cases, restrictions = restrictions, threshold_for_plotting = threshold_for_plotting)
run_main("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/malawian_introductions.excl_mwi_dups.txt", country_iso = "MWI", n_cases = national_cases, restrictions = restrictions, threshold_for_plotting = threshold_for_plotting)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  cluster_size by voc_for_plots
## Kruskal-Wallis chi-squared = 6.6506, df = 4, p-value = 0.1555

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

#run_main("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/southafrican_introductions.txt", "ZAF", n_cases = national_cases, restrictions = restrictions, threshold_for_plotting = threshold_for_plotting)
#run_main("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/thai_introductions.txt", "THA", n_cases = national_cases, restrictions = restrictions, threshold_for_plotting = threshold_for_plotting)
#run_main("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/vietnamese_introductions.txt", "VNM", n_cases = national_cases, restrictions = restrictions, threshold_for_plotting = threshold_for_plotting)

Individual plot, for development/troubleshooting

If you want to run it, then change eval to TRUE in the chunk options section.

#fig.width=14, fig.height=10
national_cases <- read_in_national_case_numbers(national_cases_handle, countries_to_include)

per_sample_output <- read_in_per_sample_intros("/Users/flashton/Dropbox/COVID/AAP_introductions/results/2022.12.02/malawian_introductions.txt", threshold_for_plotting)
per_sample_intros <- per_sample_output$per_sample_intros
vocs_to_plot <- per_sample_output$vocs_to_plot
restrictions <- read_in_restrictions(restrictions_handle, countries_to_include)
#View(intros)
country_iso <- 'MWI'
distinct_intros <- distinct(per_sample_intros, introduction_node, .keep_all = TRUE)
plot_size_distribution_by_VOC(distinct_intros, country_iso)
basic_stats_on_distinct_intros(distinct_intros)
print(kruskal.test(cluster_size ~ voc_for_plots, data = distinct_intros))

#first <- min(per_sample_intros$week)
#last <- max(per_sample_intros$week)
#fnl <- c(first, last)
#fi <- first_identification(per_sample_intros, '21I (Delta)', fnl)
#fi

#vocs_to_plot
make_overall_plots(per_sample_intros, national_cases, restrictions, "LAO")
make_per_VOC_plots(per_sample_intros, vocs_to_plot[[2]], national_cases, restrictions, "MWI")

todo list

Todo:

  1. if kw test p-val < 0.05, do pairwise wilcoxon rank
  2. filter out the ones with no sampling date before applying the thresholds?
  3. sort out problem with laos (NAs somewhere)
  4. set dynamic threshold for the “all” plot - only variants with some %age of the total samples should be plotted.